import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
# Load the loan-approval dataset from a local JSON export.
df= pd.read_json(r"D:\Dinesh\myprojects\loan_approval_dataset.json\loan_approval_dataset.json")
# Preview the first five rows.
df.head()
| Id | Income | Age | Experience | Married/Single | House_Ownership | Car_Ownership | Profession | CITY | STATE | CURRENT_JOB_YRS | CURRENT_HOUSE_YRS | Risk_Flag | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | 1303834 | 23 | 3 | single | rented | no | Mechanical_engineer | Rewa | Madhya_Pradesh | 3 | 13 | 0 |
| 1 | 2 | 7574516 | 40 | 10 | single | rented | no | Software_Developer | Parbhani | Maharashtra | 9 | 13 | 0 |
| 2 | 3 | 3991815 | 66 | 4 | married | rented | no | Technical_writer | Alappuzha | Kerala | 4 | 10 | 0 |
| 3 | 4 | 6256451 | 41 | 2 | single | rented | yes | Software_Developer | Bhubaneswar | Odisha | 2 | 12 | 1 |
| 4 | 5 | 5768871 | 47 | 11 | single | rented | no | Civil_servant | Tiruchirappalli[10] | Tamil_Nadu | 3 | 14 | 1 |
# Dataset dimensions: (rows, columns).
df.shape
(252000, 13)
# Give the 'Married/Single' column a clearer, single-token name.
df.rename(columns={'Married/Single':'Marital_status'},inplace=True)
# Boolean mask of fully duplicated rows; print how many there are.
dv = df.duplicated()
print(dv.sum())
# Show the duplicated rows themselves (empty frame when the count is 0).
df[dv]
0
| Id | Income | Age | Experience | Marital_status | House_Ownership | Car_Ownership | Profession | CITY | STATE | CURRENT_JOB_YRS | CURRENT_HOUSE_YRS | Risk_Flag |
|---|
# Column dtypes and non-null counts.
df.info()
<class 'pandas.core.frame.DataFrame'> Int64Index: 252000 entries, 0 to 251999 Data columns (total 13 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Id 252000 non-null int64 1 Income 252000 non-null int64 2 Age 252000 non-null int64 3 Experience 252000 non-null int64 4 Marital_status 252000 non-null object 5 House_Ownership 252000 non-null object 6 Car_Ownership 252000 non-null object 7 Profession 252000 non-null object 8 CITY 252000 non-null object 9 STATE 252000 non-null object 10 CURRENT_JOB_YRS 252000 non-null int64 11 CURRENT_HOUSE_YRS 252000 non-null int64 12 Risk_Flag 252000 non-null int64 dtypes: int64(7), object(6) memory usage: 26.9+ MB
# Number of distinct values in each column.
df.nunique()
Id 252000 Income 41920 Age 59 Experience 21 Marital_status 2 House_Ownership 3 Car_Ownership 2 Profession 51 CITY 317 STATE 29 CURRENT_JOB_YRS 15 CURRENT_HOUSE_YRS 5 Risk_Flag 2 dtype: int64
# Inspect the raw levels of each column of interest, one per line,
# in the same order as before.
for column in ['Experience', 'Marital_status', 'House_Ownership',
               'Car_Ownership', 'Profession', 'CITY', 'STATE',
               'CURRENT_JOB_YRS', 'CURRENT_HOUSE_YRS', 'Risk_Flag']:
    print(df[column].unique())
[ 3 10 4 2 11 0 14 17 12 7 9 6 8 1 13 19 15 20 5 16 18] ['single' 'married'] ['rented' 'norent_noown' 'owned'] ['no' 'yes'] ['Mechanical_engineer' 'Software_Developer' 'Technical_writer' 'Civil_servant' 'Librarian' 'Economist' 'Flight_attendant' 'Architect' 'Designer' 'Physician' 'Financial_Analyst' 'Air_traffic_controller' 'Politician' 'Police_officer' 'Artist' 'Surveyor' 'Design_Engineer' 'Chemical_engineer' 'Hotel_Manager' 'Dentist' 'Comedian' 'Biomedical_Engineer' 'Graphic_Designer' 'Computer_hardware_engineer' 'Petroleum_Engineer' 'Secretary' 'Computer_operator' 'Chartered_Accountant' 'Technician' 'Microbiologist' 'Fashion_Designer' 'Aviator' 'Psychologist' 'Magistrate' 'Lawyer' 'Firefighter' 'Engineer' 'Official' 'Analyst' 'Geologist' 'Drafter' 'Statistician' 'Web_designer' 'Consultant' 'Chef' 'Army_officer' 'Surgeon' 'Scientist' 'Civil_engineer' 'Industrial_Engineer' 'Technology_specialist'] ['Rewa' 'Parbhani' 'Alappuzha' 'Bhubaneswar' 'Tiruchirappalli[10]' 'Jalgaon' 'Tiruppur' 'Jamnagar' 'Kota[6]' 'Karimnagar' 'Hajipur[31]' 'Adoni' 'Erode[17]' 'Kollam' 'Madurai' 'Anantapuram[24]' 'Kamarhati' 'Bhusawal' 'Sirsa' 'Amaravati' 'Secunderabad' 'Ahmedabad' 'Ajmer' 'Ongole' 'Miryalaguda' 'Ambattur' 'Indore' 'Pondicherry' 'Shimoga' 'Chennai' 'Gulbarga' 'Khammam' 'Saharanpur' 'Gopalpur' 'Amravati' 'Udupi' 'Howrah' 'Aurangabad[39]' 'Hospet' 'Shimla' 'Khandwa' 'Bidhannagar' 'Bellary' 'Danapur' 'Purnia[26]' 'Bijapur' 'Patiala' 'Malda' 'Sagar' 'Durgapur' 'Junagadh' 'Singrauli' 'Agartala' 'Thanjavur' 'Hindupur' 'Naihati' 'North_Dumdum' 'Panchkula' 'Anantapur' 'Serampore' 'Bathinda' 'Nadiad' 'Kanpur' 'Haridwar' 'Berhampur' 'Jamshedpur' 'Hyderabad' 'Bidar' 'Kottayam' 'Solapur' 'Suryapet' 'Aizawl' 'Asansol' 'Deoghar' 'Eluru[25]' 'Ulhasnagar' 'Aligarh' 'South_Dumdum' 'Berhampore' 'Gandhinagar' 'Sonipat' 'Muzaffarpur' 'Raichur' 'Rajpur_Sonarpur' 'Ambarnath' 'Katihar' 'Kozhikode' 'Vellore' 'Malegaon' 'Kochi' 'Nagaon' 'Nagpur' 'Srinagar' 'Davanagere' 'Bhagalpur' 'Siwan[32]' 
'Meerut' 'Dindigul' 'Bhatpara' 'Ghaziabad' 'Kulti' 'Chapra' 'Dibrugarh' 'Panihati' 'Bhiwandi' 'Morbi' 'Kalyan-Dombivli' 'Gorakhpur' 'Panvel' 'Siliguri' 'Bongaigaon' 'Patna' 'Ramgarh' 'Ozhukarai' 'Mirzapur' 'Akola' 'Satna' 'Motihari[34]' 'Jalna' 'Jalandhar' 'Unnao' 'Karnal' 'Cuttack' 'Proddatur' 'Ichalkaranji' 'Warangal[11][12]' 'Jhansi' 'Bulandshahr' 'Narasaraopet' 'Chinsurah' 'Jehanabad[38]' 'Dhanbad' 'Gudivada' 'Gandhidham' 'Raiganj' 'Kishanganj[35]' 'Varanasi' 'Belgaum' 'Tirupati[21][22]' 'Tumkur' 'Coimbatore' 'Kurnool[18]' 'Gurgaon' 'Muzaffarnagar' 'Aurangabad' 'Bhavnagar' 'Arrah' 'Munger' 'Tirunelveli' 'Mumbai' 'Mango' 'Nashik' 'Kadapa[23]' 'Amritsar' 'Khora,_Ghaziabad' 'Ambala' 'Agra' 'Ratlam' 'Surendranagar_Dudhrej' 'Delhi_city' 'Bhopal' 'Hapur' 'Rohtak' 'Durg' 'Korba' 'Bangalore' 'Shivpuri' 'Thrissur' 'Vijayanagaram' 'Farrukhabad' 'Nangloi_Jat' 'Madanapalle' 'Thoothukudi' 'Nagercoil' 'Gaya' 'Chandigarh_city' 'Jammu[16]' 'Kakinada' 'Dewas' 'Bhalswa_Jahangir_Pur' 'Baranagar' 'Firozabad' 'Phusro' 'Allahabad' 'Guna' 'Thane' 'Etawah' 'Vasai-Virar' 'Pallavaram' 'Morena' 'Ballia' 'Surat' 'Burhanpur' 'Phagwara' 'Mau' 'Mangalore' 'Alwar' 'Mahbubnagar' 'Maheshtala' 'Hazaribagh' 'Bihar_Sharif' 'Faridabad' 'Lucknow' 'Tenali' 'Barasat' 'Amroha' 'Giridih' 'Begusarai' 'Medininagar' 'Rajahmundry[19][20]' 'Saharsa[29]' 'New_Delhi' 'Bhilai' 'Moradabad' 'Machilipatnam' 'Mira-Bhayandar' 'Pali' 'Navi_Mumbai' 'Mehsana' 'Imphal' 'Kolkata' 'Sambalpur' 'Ujjain' 'Madhyamgram' 'Jabalpur' 'Jamalpur[36]' 'Ludhiana' 'Bareilly' 'Gangtok' 'Anand' 'Dehradun' 'Pune' 'Satara' 'Srikakulam' 'Raipur' 'Jodhpur' 'Darbhanga' 'Nizamabad' 'Nandyal' 'Dehri[30]' 'Jorhat' 'Ranchi' 'Kumbakonam' 'Guntakal' 'Haldia' 'Loni' 'Pimpri-Chinchwad' 'Rajkot' 'Nanded' 'Noida' 'Kirari_Suleman_Nagar' 'Jaunpur' 'Bilaspur' 'Sambhal' 'Dhule' 'Rourkela' 'Thiruvananthapuram' 'Dharmavaram' 'Nellore[14][15]' 'Visakhapatnam[4]' 'Karawal_Nagar' 'Jaipur' 'Avadi' 'Bhimavaram' 'Bardhaman' 'Silchar' 'Buxar[37]' 'Kavali' 'Tezpur' 
'Ramagundam[27]' 'Yamunanagar' 'Sri_Ganganagar' 'Sasaram[30]' 'Sikar' 'Bally' 'Bhiwani' 'Rampur' 'Uluberia' 'Sangli-Miraj_&_Kupwad' 'Hosur' 'Bikaner' 'Shahjahanpur' 'Sultan_Pur_Majra' 'Vijayawada' 'Bharatpur' 'Tadepalligudem' 'Tinsukia' 'Salem' 'Mathura' 'Guntur[13]' 'Hubli–Dharwad' 'Guwahati' 'Chittoor[28]' 'Tiruvottiyur' 'Vadodara' 'Ahmednagar' 'Fatehpur' 'Bhilwara' 'Kharagpur' 'Bettiah[33]' 'Bhind' 'Bokaro' 'Karaikudi' 'Raebareli' 'Pudukkottai' 'Udaipur' 'Mysore[7][8][9]' 'Panipat' 'Latur' 'Tadipatri' 'Bahraich' 'Orai' 'Raurkela_Industrial_Township' 'Gwalior' 'Katni' 'Chandrapur' 'Kolhapur'] ['Madhya_Pradesh' 'Maharashtra' 'Kerala' 'Odisha' 'Tamil_Nadu' 'Gujarat' 'Rajasthan' 'Telangana' 'Bihar' 'Andhra_Pradesh' 'West_Bengal' 'Haryana' 'Puducherry' 'Karnataka' 'Uttar_Pradesh' 'Himachal_Pradesh' 'Punjab' 'Tripura' 'Uttarakhand' 'Jharkhand' 'Mizoram' 'Assam' 'Jammu_and_Kashmir' 'Delhi' 'Chhattisgarh' 'Chandigarh' 'Uttar_Pradesh[5]' 'Manipur' 'Sikkim'] [ 3 9 4 2 0 8 11 5 7 6 12 1 10 13 14] [13 10 12 14 11] [0 1]
# Percentage of missing values per column.
(df.isnull().sum()/df.shape[0])*100
Id 0.0 Income 0.0 Age 0.0 Experience 0.0 Marital_status 0.0 House_Ownership 0.0 Car_Ownership 0.0 Profession 0.0 CITY 0.0 STATE 0.0 CURRENT_JOB_YRS 0.0 CURRENT_HOUSE_YRS 0.0 Risk_Flag 0.0 dtype: float64
# Visual confirmation that no cells are null.
sns.heatmap(df.isnull())
plt.show()
# Drop the row-identifier column — it carries no predictive signal.
df.drop(columns=['Id'],inplace=True)
#df.dropna(axis=0,how='any', inplace= True)
print(df.shape)
df.head()
(252000, 12)
| Income | Age | Experience | Marital_status | House_Ownership | Car_Ownership | Profession | CITY | STATE | CURRENT_JOB_YRS | CURRENT_HOUSE_YRS | Risk_Flag | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1303834 | 23 | 3 | single | rented | no | Mechanical_engineer | Rewa | Madhya_Pradesh | 3 | 13 | 0 |
| 1 | 7574516 | 40 | 10 | single | rented | no | Software_Developer | Parbhani | Maharashtra | 9 | 13 | 0 |
| 2 | 3991815 | 66 | 4 | married | rented | no | Technical_writer | Alappuzha | Kerala | 4 | 10 | 0 |
| 3 | 6256451 | 41 | 2 | single | rented | yes | Software_Developer | Bhubaneswar | Odisha | 2 | 12 | 1 |
| 4 | 5768871 | 47 | 11 | single | rented | no | Civil_servant | Tiruchirappalli[10] | Tamil_Nadu | 3 | 14 | 1 |
# Summary statistics for both numeric and categorical columns.
df.describe(include='all')
| Income | Age | Experience | Marital_status | House_Ownership | Car_Ownership | Profession | CITY | STATE | CURRENT_JOB_YRS | CURRENT_HOUSE_YRS | Risk_Flag | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| count | 2.520000e+05 | 252000.000000 | 252000.000000 | 252000 | 252000 | 252000 | 252000 | 252000 | 252000 | 252000.000000 | 252000.000000 | 252000.000000 |
| unique | NaN | NaN | NaN | 2 | 3 | 2 | 51 | 317 | 29 | NaN | NaN | NaN |
| top | NaN | NaN | NaN | single | rented | no | Physician | Vijayanagaram | Uttar_Pradesh | NaN | NaN | NaN |
| freq | NaN | NaN | NaN | 226272 | 231898 | 176000 | 5957 | 1259 | 28400 | NaN | NaN | NaN |
| mean | 4.997117e+06 | 49.954071 | 10.084437 | NaN | NaN | NaN | NaN | NaN | NaN | 6.333877 | 11.997794 | 0.123000 |
| std | 2.878311e+06 | 17.063855 | 6.002590 | NaN | NaN | NaN | NaN | NaN | NaN | 3.647053 | 1.399037 | 0.328438 |
| min | 1.031000e+04 | 21.000000 | 0.000000 | NaN | NaN | NaN | NaN | NaN | NaN | 0.000000 | 10.000000 | 0.000000 |
| 25% | 2.503015e+06 | 35.000000 | 5.000000 | NaN | NaN | NaN | NaN | NaN | NaN | 3.000000 | 11.000000 | 0.000000 |
| 50% | 5.000694e+06 | 50.000000 | 10.000000 | NaN | NaN | NaN | NaN | NaN | NaN | 6.000000 | 12.000000 | 0.000000 |
| 75% | 7.477502e+06 | 65.000000 | 15.000000 | NaN | NaN | NaN | NaN | NaN | NaN | 9.000000 | 13.000000 | 0.000000 |
| max | 9.999938e+06 | 79.000000 | 20.000000 | NaN | NaN | NaN | NaN | NaN | NaN | 14.000000 | 14.000000 | 1.000000 |
# Quick histogram grid over the numeric columns.
df.hist(column=['Income','Age','Experience','CURRENT_JOB_YRS','CURRENT_HOUSE_YRS'],bins="auto",edgecolor="black",align="left",grid=False,figsize=(10, 6))
plt.show()
# Income and Age distributions with KDE overlays, side by side.
plt.figure(figsize=(15,5))
plt.subplot(1,2,1)
sns.histplot(df,x='Income',bins=30,kde=True, edgecolor='black')
plt.subplot(1,2,2)
# One bin per year of age.
sns.histplot(df,x='Age',bins=range(min(df['Age']), max(df['Age']) + 1),kde=True, edgecolor='black')
plt.show()
# Box plots to eyeball outliers in Income and Age; showmeans marks the mean.
plt.figure(figsize=(15,5))
plt.subplot(1,2,1)
df.boxplot(column=["Income"],vert=False,showmeans=True)
plt.subplot(1,2,2)
df.boxplot(column=["Age"],vert=False,showmeans=True)
plt.show()
# Count plots for the target and the low-cardinality predictors.
variables = ['Risk_Flag', 'Marital_status', 'Car_Ownership', 'House_Ownership','CURRENT_HOUSE_YRS','CURRENT_JOB_YRS','Experience']
plt.figure(figsize=(10, 13))
for i, var in enumerate(variables, start=1):
    plt.subplot(4, 2, i)
    sns.countplot(data=df, y=var)
plt.tight_layout()
plt.show()
# Count plots for the two high-cardinality categoricals.
variables = ['Profession','STATE']
plt.figure(figsize=(12, 18))
for i, var in enumerate(variables, start=1):
    plt.subplot(1, 2, i)
    sns.countplot(data=df, y=var)
plt.tight_layout()
plt.show()
import seaborn as sns
import matplotlib.pyplot as plt
# Function to generate top N plot
def generate_top_n_plot(data, title, xlabel, n=10):
    """Horizontal bar chart of the *n* most frequent values in *data*.

    Parameters
    ----------
    data : pd.Series
        Categorical column to count.
    title : str
        Figure title (previously this argument was accepted but ignored;
        it is now actually displayed).
    xlabel : str
        Label used for the category axis and the counts table.
    n : int, default 10
        Number of top categories to plot.
    """
    # Build the counts frame explicitly instead of relying on the column
    # names produced by value_counts().reset_index(), which differ between
    # pandas <2.0 ('index' / <series name>) and pandas >=2.0 (<name> / 'count').
    counts = data.value_counts().head(n)
    top_n_values = pd.DataFrame({xlabel: counts.index, 'Count': counts.values})
    # Plot top N values
    plt.figure(figsize=(10, 6))
    sns.barplot(x='Count', y=xlabel, data=top_n_values, palette='coolwarm')
    plt.xlabel('Count')
    plt.ylabel(xlabel)
    plt.title(title)  # use the caller-supplied title (was hard-coded before)
    plt.tight_layout()
    plt.show()
# Generate top N plots for Profession, CITY, and STATE columns
generate_top_n_plot(df['Profession'], 'Top N Profession Counts', 'Profession', n=10)
generate_top_n_plot(df['CITY'], 'Top N City Counts', 'City', n=10)
generate_top_n_plot(df['STATE'], 'Top N State Counts', 'State', n=10)
from wordcloud import WordCloud
import matplotlib.pyplot as plt
# Helper: render a word cloud from a categorical column.
def generate_wordcloud(data, title):
    """Display a word cloud built from every value in *data*."""
    # Join all entries into one whitespace-separated corpus.
    corpus = ' '.join(data)
    # Build the cloud image.
    cloud = WordCloud(width=800, height=400, background_color='white').generate(corpus)
    # Render it without axes.
    plt.figure(figsize=(10, 5))
    plt.imshow(cloud, interpolation='bilinear')
    plt.title(title)
    plt.axis('off')
    plt.show()
# Generate word clouds for each column
generate_wordcloud(df['Profession'], 'Profession Word Cloud')
generate_wordcloud(df['CITY'], 'City Word Cloud')
generate_wordcloud(df['STATE'], 'State Word Cloud')
#sns.pairplot(df,hue="Risk_Flag")
# Income and Age distributions split by the Risk_Flag target.
plt.figure(figsize=(15,5))
plt.subplot(1,2,1)
sns.histplot(df,x='Income',bins=30,hue='Risk_Flag',kde=True, edgecolor='black')
plt.subplot(1,2,2)
sns.histplot(df,x='Age',bins=range(min(df['Age']), max(df['Age']) + 1),hue='Risk_Flag',kde=True, edgecolor='black')
plt.show()
# Count plots of each predictor split by Risk_Flag.
variables = ['Marital_status', 'Car_Ownership', 'House_Ownership','CURRENT_HOUSE_YRS','CURRENT_JOB_YRS','Experience']
plt.figure(figsize=(10, 13))
for i, var in enumerate(variables, start=1):
    plt.subplot(4, 2, i)
    sns.countplot(data=df, x=var,hue='Risk_Flag')
plt.tight_layout()
plt.show()
# Stacked bars: Risk_Flag breakdown within the 10 most frequent values of
# CITY, STATE and Profession (the fourth subplot stays empty).
columns = ['CITY', 'STATE', 'Profession']
fig, axs = plt.subplots(2, 2, figsize=(15, 10))
axs = axs.flatten()
for i, col in enumerate(columns):
    # Restrict to rows whose value is among the column's top-10 categories.
    df_top_n = df[df[col].isin(df[col].value_counts().head(10).index)]
    df_top_n.groupby(col)['Risk_Flag'].value_counts().unstack().plot(kind='bar', stacked=True, colormap='viridis', ax=axs[i])
    axs[i].set_xlabel(col.capitalize())
    axs[i].set_ylabel('Frequency')
    axs[i].set_title('Top {} {}s vs Risk Flag'.format(10, col.capitalize()))
    axs[i].tick_params(axis='x', rotation=90)
    axs[i].legend(title='Risk Flag', loc='lower right')
plt.tight_layout()
plt.show()
# Aggregate cities by state
# Count distinct cities within each state, most first.
city_state_counts = df.groupby('STATE')['CITY'].nunique().sort_values(ascending=False)
# Plot aggregated counts
plt.figure(figsize=(10, 6))
sns.barplot(x=city_state_counts.index, y=city_state_counts.values, palette='viridis')
plt.xlabel('State')
plt.ylabel('Number of Cities')
plt.title('Number of Unique Cities by State')
plt.xticks(rotation=90)
plt.tight_layout()
plt.show()
# Aggregate cities by state and risk flag
city_state_risk_counts = df.groupby(['STATE', 'Risk_Flag'])['CITY'].nunique().unstack()
# Plot aggregated counts.
# NOTE: DataFrame.plot opens its own figure, so a preceding
# plt.figure(figsize=(12, 6)) only produced a dangling empty figure
# ("<Figure size 1200x600 with 0 Axes>"); pass figsize to .plot() instead.
city_state_risk_counts.plot(kind='bar', stacked=True, colormap='viridis', figsize=(12, 6))
plt.xlabel('State')
plt.ylabel('Number of Cities')
plt.title('Number of Unique Cities by State and Risk Flag')
plt.xticks(rotation=90)
plt.legend(title='Risk Flag')
plt.tight_layout()
plt.show()
<Figure size 1200x600 with 0 Axes>
# Label Encoding
# Integer-encode the high-cardinality categoricals.
# NOTE(review): LabelEncoder imposes an arbitrary ordinal order on nominal
# data; tree models tolerate this, but linear models may be misled.
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
df['Profession'] = le.fit_transform(df['Profession'])
df['CITY'] = le.fit_transform(df['CITY'])
df['STATE'] = le.fit_transform(df['STATE'])
# One-hot encode the remaining object columns, dropping the first level
# of each to avoid the dummy-variable trap.
df = pd.get_dummies(df, drop_first=True)
df.head()
| Income | Age | Experience | Profession | CITY | STATE | CURRENT_JOB_YRS | CURRENT_HOUSE_YRS | Risk_Flag | Marital_status_single | House_Ownership_owned | House_Ownership_rented | Car_Ownership_yes | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1303834 | 23 | 3 | 33 | 251 | 13 | 3 | 13 | 0 | 1 | 0 | 1 | 0 |
| 1 | 7574516 | 40 | 10 | 43 | 227 | 14 | 9 | 13 | 0 | 1 | 0 | 1 | 0 |
| 2 | 3991815 | 66 | 4 | 47 | 8 | 12 | 4 | 10 | 0 | 0 | 0 | 1 | 0 |
| 3 | 6256451 | 41 | 2 | 43 | 54 | 17 | 2 | 12 | 1 | 1 | 0 | 1 | 1 |
| 4 | 5768871 | 47 | 11 | 11 | 296 | 22 | 3 | 14 | 1 | 1 | 0 | 1 | 0 |
# one hot encoding for nominal data
#dummies=pd.get_dummies(X_train[['Marital_status','House_Ownership','Car_Ownership']],drop_first=True)
#X_train=pd.concat([X_train,dummies],axis="columns")
#X_train=X_train.drop(columns=['Marital_status','House_Ownership','Car_Ownership'])
#X_train.head()
# Separate predictors from the Risk_Flag target.
X = df.drop(["Risk_Flag"],axis=1)
y = df["Risk_Flag"]
from sklearn.model_selection import train_test_split
# 80/20 train-test split with a fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)
print(X_train.shape,X_test.shape)
print(y_train.shape,y_test.shape)
(201600, 12) (50400, 12) (201600,) (50400,)
# Normalization: scale features to [0, 1], fitting the scaler on the
# training set only to avoid test-set leakage.
from sklearn.preprocessing import MinMaxScaler
norm = MinMaxScaler().fit(X_train)
X_train_norm=pd.DataFrame(norm.transform(X_train),index=X_train.index, columns=X_train.columns)
X_train_norm.head()
| Income | Age | Experience | Profession | CITY | STATE | CURRENT_JOB_YRS | CURRENT_HOUSE_YRS | Marital_status_single | House_Ownership_owned | House_Ownership_rented | Car_Ownership_yes | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 200471 | 0.912051 | 0.844828 | 0.90 | 0.08 | 0.854430 | 0.714286 | 0.214286 | 0.25 | 1.0 | 0.0 | 1.0 | 0.0 |
| 92611 | 0.262896 | 0.310345 | 0.00 | 0.26 | 0.984177 | 0.785714 | 0.000000 | 0.25 | 1.0 | 0.0 | 1.0 | 0.0 |
| 86397 | 0.100127 | 0.310345 | 0.45 | 0.84 | 0.389241 | 0.821429 | 0.285714 | 0.75 | 1.0 | 0.0 | 1.0 | 0.0 |
| 110500 | 0.389116 | 0.000000 | 0.10 | 0.96 | 0.933544 | 0.035714 | 0.142857 | 0.50 | 1.0 | 0.0 | 1.0 | 0.0 |
| 185133 | 0.946972 | 0.344828 | 0.25 | 1.00 | 0.177215 | 0.392857 | 0.357143 | 0.75 | 1.0 | 0.0 | 0.0 | 1.0 |
# Re-attach the target so the correlation matrix includes Risk_Flag.
data = pd.concat([X_train_norm, y_train], axis=1)
data.corr()
| Income | Age | Experience | Profession | CITY | STATE | CURRENT_JOB_YRS | CURRENT_HOUSE_YRS | Marital_status_single | House_Ownership_owned | House_Ownership_rented | Car_Ownership_yes | Risk_Flag | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| Income | 1.000000 | -0.002466 | 0.008273 | 0.000553 | -0.002330 | -0.006887 | 0.008356 | -0.003107 | 0.003193 | 0.001680 | 0.011155 | 0.003712 | -0.002447 |
| Age | -0.002466 | 1.000000 | -0.000095 | -0.008217 | 0.002690 | -0.004538 | 0.002961 | -0.018993 | -0.006075 | 0.017387 | -0.013377 | 0.007792 | -0.018032 |
| Experience | 0.008273 | -0.000095 | 1.000000 | 0.000055 | -0.026464 | -0.000633 | 0.644987 | 0.018932 | -0.002599 | 0.014209 | -0.011364 | 0.008769 | -0.035569 |
| Profession | 0.000553 | -0.008217 | 0.000055 | 1.000000 | 0.018251 | 0.000581 | -0.004643 | 0.000330 | -0.007818 | -0.010184 | 0.005536 | 0.011780 | -0.006362 |
| CITY | -0.002330 | 0.002690 | -0.026464 | 0.018251 | 1.000000 | -0.038217 | -0.027255 | -0.009860 | 0.012715 | 0.012001 | -0.014917 | 0.000923 | 0.005170 |
| STATE | -0.006887 | -0.004538 | -0.000633 | 0.000581 | -0.038217 | 1.000000 | 0.008471 | 0.006612 | -0.009693 | 0.020623 | -0.013333 | 0.026765 | -0.003986 |
| CURRENT_JOB_YRS | 0.008356 | 0.002961 | 0.644987 | -0.004643 | -0.027255 | 0.008471 | 1.000000 | 0.005821 | 0.004809 | 0.007610 | -0.010560 | 0.012800 | -0.017006 |
| CURRENT_HOUSE_YRS | -0.003107 | -0.018993 | 0.018932 | 0.000330 | -0.009860 | 0.006612 | 0.005821 | 1.000000 | -0.006836 | 0.015542 | -0.009812 | 0.000555 | -0.004627 |
| Marital_status_single | 0.003193 | -0.006075 | -0.002599 | -0.007818 | 0.012715 | -0.009693 | 0.004809 | -0.006836 | 1.000000 | 0.023850 | -0.024656 | -0.000259 | 0.021842 |
| House_Ownership_owned | 0.001680 | 0.017387 | 0.014209 | -0.010184 | 0.012001 | 0.020623 | 0.007610 | 0.015542 | 0.023850 | 1.000000 | -0.788453 | 0.002761 | -0.023179 |
| House_Ownership_rented | 0.011155 | -0.013377 | -0.011364 | 0.005536 | -0.014917 | -0.013333 | -0.010560 | -0.009812 | -0.024656 | -0.788453 | 1.000000 | -0.001784 | 0.025827 |
| Car_Ownership_yes | 0.003712 | 0.007792 | 0.008769 | 0.011780 | 0.000923 | 0.026765 | 0.012800 | 0.000555 | -0.000259 | 0.002761 | -0.001784 | 1.000000 | -0.023798 |
| Risk_Flag | -0.002447 | -0.018032 | -0.035569 | -0.006362 | 0.005170 | -0.003986 | -0.017006 | -0.004627 | 0.021842 | -0.023179 | 0.025827 | -0.023798 | 1.000000 |
# Annotated correlation heatmap over every feature plus the target.
corrmat= data.corr()
# corrmat.index covers all columns, so the heatmap spans the full matrix.
top_corr_features= corrmat.index
plt.figure(figsize=(20,20))
#plot heat map
sns.heatmap(data[top_corr_features].corr(),annot=True,cmap="RdYlGn")
<Axes: >
We now flag the variables least correlated with the target — i.e., those whose correlation with the target variable lies between -0.01 and 0.01 — as candidates for removal from our data.
# Correlation-Based Feature Selection
def correlation_feature_selection(X_train_norm, y_train):
    """Return each feature's Pearson correlation with the target, plus the
    names of the features whose absolute correlation exceeds 0.01."""
    correlations = X_train_norm.corrwith(y_train)
    strong_enough = correlations.abs() > 0.01
    selected_features = correlations[strong_enough].index.tolist()
    return correlations, selected_features
# Run the correlation filter on the normalised training data.
corr, selected_features_corr = correlation_feature_selection(X_train_norm, y_train)
print(corr)
# "\033[1m...\033[0m" renders the heading in bold via ANSI escape codes.
print("\033[1mCorrelation Feature Selection\033[0m:", selected_features_corr)
Income -0.002447
Age -0.018032
Experience -0.035569
Profession -0.006362
CITY 0.005170
STATE -0.003986
CURRENT_JOB_YRS -0.017006
CURRENT_HOUSE_YRS -0.004627
Marital_status_single 0.021842
House_Ownership_owned -0.023179
House_Ownership_rented 0.025827
Car_Ownership_yes -0.023798
dtype: float64
Correlation Feature Selection: ['Age', 'Experience', 'CURRENT_JOB_YRS', 'Marital_status_single', 'House_Ownership_owned', 'House_Ownership_rented', 'Car_Ownership_yes']
from statsmodels.stats.outliers_influence import variance_inflation_factor
def calculate_vif(X):
    """Return a DataFrame with the variance inflation factor of each column.

    VIF measures multicollinearity: a column's VIF is 1/(1 - R^2) from
    regressing it on all the other columns.
    """
    # Calculate VIF for each feature
    vif_data = pd.DataFrame()
    vif_data["Feature"] = X.columns
    vif_data["VIF"] = [variance_inflation_factor(X.values, i) for i in range(X.shape[1])]
    return vif_data
# Call the function to calculate VIF for independent variables
calculate_vif(X_train_norm)
| Feature | VIF | |
|---|---|---|
| 0 | Income | 3.842598 |
| 1 | Age | 3.704013 |
| 2 | Experience | 6.503444 |
| 3 | Profession | 3.764492 |
| 4 | CITY | 3.748792 |
| 5 | STATE | 3.062002 |
| 6 | CURRENT_JOB_YRS | 6.796901 |
| 7 | CURRENT_HOUSE_YRS | 2.942678 |
| 8 | Marital_status_single | 8.377359 |
| 9 | House_Ownership_owned | 1.856473 |
| 10 | House_Ownership_rented | 15.596888 |
| 11 | Car_Ownership_yes | 1.426998 |
from sklearn.feature_selection import VarianceThreshold
def variance_threshold_selection(X_train_norm, threshold):
    """Select features whose population variance exceeds *threshold*.

    Replicates sklearn's VarianceThreshold (which keeps features whose
    population variance, ddof=0, is strictly greater than the threshold)
    directly in pandas.  Unlike the original implementation, the reported
    variances are the SAME ddof=0 values the selection uses — the original
    selected on population variance but returned sample variances
    (``DataFrame.var()`` defaults to ddof=1), so the printed table could
    disagree with the selection near the threshold.

    Returns
    -------
    (selected_features, variance_values) : (list[str], list[float])
        Names of the kept columns, and the variance of every column in
        input order.
    """
    variances = X_train_norm.var(ddof=0)
    selected_features = variances.index[variances > threshold].tolist()
    return selected_features, variances.tolist()
# Keep only features whose variance clears the 0.1 threshold.
selected_features, variance_values = variance_threshold_selection(X_train_norm, threshold=0.1)
# Print selected features
print("\033[1mVariance Threshold Feature Selection\033[0m:")
print(pd.DataFrame({"Feature": X_train_norm.columns, "Variance": variance_values}))
print("\nSelected Features:", selected_features)
Variance Threshold Feature Selection:
Feature Variance
0 Income 0.083157
1 Age 0.086685
2 Experience 0.090024
3 Profession 0.086691
4 CITY 0.085093
5 STATE 0.112098
6 CURRENT_JOB_YRS 0.067834
7 CURRENT_HOUSE_YRS 0.122288
8 Marital_status_single 0.091927
9 House_Ownership_owned 0.048334
10 House_Ownership_rented 0.073146
11 Car_Ownership_yes 0.210689
Selected Features: ['STATE', 'CURRENT_HOUSE_YRS', 'Car_Ownership_yes']
from sklearn.feature_selection import chi2
def chi_square_selection(X_train_norm, y_train, alpha=0.05):
    """Chi-square test of each feature against the target; keep the features
    whose p-value falls below *alpha*.  (chi2 requires non-negative inputs,
    which min-max scaled data satisfies.)"""
    chi2_scores, p_values = chi2(X_train_norm, y_train)
    selected_features = [col for col, p in zip(X_train_norm.columns, p_values)
                         if p < alpha]
    return chi2_scores, p_values, selected_features
# Call the function to perform Chi-Square feature selection
chi2_scores, p_values, selected_features = chi_square_selection(X_train_norm, y_train)
# Print Chi-Square scores and p-values
print("\033[1mChi-Square Feature Selection\033[0m:")
print(pd.DataFrame({"Feature": X_train_norm.columns, "Chi-Square Scores:": chi2_scores,"p_Values:": p_values}))
# Print selected features
print("\nSelected Features:", selected_features)
Chi-Square Feature Selection:
Feature Chi-Square Scores: p_Values:
0 Income 0.200867 6.540221e-01
1 Age 11.359315 7.507050e-04
2 Experience 45.529208 1.503795e-11
3 Profession 1.399370 2.368292e-01
4 CITY 0.916720 3.383376e-01
5 STATE 0.728246 3.934523e-01
6 CURRENT_JOB_YRS 8.742878 3.108136e-03
7 CURRENT_HOUSE_YRS 1.057219 3.038503e-01
8 Marital_status_single 9.849853 1.698453e-03
9 House_Ownership_owned 102.797657 3.711884e-24
10 House_Ownership_rented 10.684977 1.080091e-03
11 Car_Ownership_yes 79.728245 4.296150e-19
Selected Features: ['Age', 'Experience', 'CURRENT_JOB_YRS', 'Marital_status_single', 'House_Ownership_owned', 'House_Ownership_rented', 'Car_Ownership_yes']
# ANOVA Feature Selection
from sklearn.feature_selection import f_classif
def anova_selection(X_train_norm, y_train, alpha=0.05):
    """One-way ANOVA F-test per feature; keep features with p < *alpha*."""
    f_scores, p_values = f_classif(X_train_norm, y_train)
    significant = p_values < alpha
    selected_features = X_train_norm.columns[significant].tolist()
    return f_scores, p_values, selected_features
# Run the ANOVA F-test filter at the default alpha of 0.05.
f_scores, p_values,selected_features = anova_selection(X_train_norm, y_train)
print("\033[1mANOVA Feature Selection\033[0m:")
print(pd.DataFrame({"Feature": X_train_norm.columns, "f_scores": f_scores,"p_values": p_values}))
print("\nSelected Features:", selected_features)
ANOVA Feature Selection:
Feature f_scores p_values
0 Income 1.206692 2.719895e-01
1 Age 65.570938 5.636809e-16
2 Experience 255.377571 1.894515e-57
3 Profession 8.160826 4.280925e-03
4 CITY 5.388817 2.026720e-02
5 STATE 3.202353 7.353392e-02
6 CURRENT_JOB_YRS 58.319116 2.238419e-14
7 CURRENT_HOUSE_YRS 4.315660 3.776437e-02
8 Marital_status_single 96.220196 1.040023e-22
9 House_Ownership_owned 108.370967 2.262284e-25
10 House_Ownership_rented 134.559451 4.216880e-31
11 Car_Ownership_yes 114.242627 1.172147e-26
Selected Features: ['Age', 'Experience', 'Profession', 'CITY', 'CURRENT_JOB_YRS', 'CURRENT_HOUSE_YRS', 'Marital_status_single', 'House_Ownership_owned', 'House_Ownership_rented', 'Car_Ownership_yes']
# Mutual Information Feature Selection
from sklearn.feature_selection import mutual_info_classif
def mutual_information_selection(X_train_norm, y_train, n_features=None):
    """Rank features by mutual information with the target.

    Returns every feature's MI score plus the feature names ordered by
    decreasing score, truncated to *n_features* when a truthy value is given.
    """
    mi_scores = mutual_info_classif(X_train_norm, y_train)
    # Indices sorted by descending MI score.
    ranked = mi_scores.argsort()[::-1]
    if n_features:
        ranked = ranked[:n_features]
    selected_features = X_train_norm.columns[ranked].tolist()
    return mi_scores, selected_features
# Keep the five features with the highest mutual-information scores.
mi_scores, selected_features = mutual_information_selection(X_train_norm, y_train, n_features=5)
print("\033[1mMutual Information Feature Selection\033[0m:")
print(pd.DataFrame({"Feature": X_train_norm.columns, "mi_scores": mi_scores}))
print("\nSelected Features:", selected_features)
Mutual Information Feature Selection:
Feature mi_scores
0 Income 0.160061
1 Age 0.003399
2 Experience 0.003043
3 Profession 0.002073
4 CITY 0.009578
5 STATE 0.004406
6 CURRENT_JOB_YRS 0.002574
7 CURRENT_HOUSE_YRS 0.007653
8 Marital_status_single 0.028819
9 House_Ownership_owned 0.000759
10 House_Ownership_rented 0.025344
11 Car_Ownership_yes 0.009557
Selected Features: ['Income', 'Marital_status_single', 'House_Ownership_rented', 'CITY', 'Car_Ownership_yes']
# Random Forest Feature Selection
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectFromModel
def random_forest_feature_selection(X_train_norm, y_train):
    """Fit a 100-tree random forest and keep the features whose importance
    clears SelectFromModel's default (mean-importance) threshold."""
    forest = RandomForestClassifier(n_estimators=100)
    forest.fit(X_train_norm, y_train)
    feature_importances = forest.feature_importances_
    keep_mask = SelectFromModel(forest, prefit=True).get_support()
    selected_features = X_train_norm.columns[keep_mask].tolist()
    return feature_importances, selected_features
# Run the random-forest importance filter (unseeded, so results vary per run).
feature_importances, selected_features = random_forest_feature_selection(X_train_norm, y_train)
print("\033[1mRandom Forest Feature Selection\033[0m:")
print(pd.DataFrame({"Feature": X_train_norm.columns, "feature_importances": feature_importances}))
print("\nSelected Features:", selected_features)
Random Forest Feature Selection:
Feature feature_importances
0 Income 0.188004
1 Age 0.143938
2 Experience 0.082905
3 Profession 0.146261
4 CITY 0.166537
5 STATE 0.102217
6 CURRENT_JOB_YRS 0.073676
7 CURRENT_HOUSE_YRS 0.064035
8 Marital_status_single 0.009798
9 House_Ownership_owned 0.003313
10 House_Ownership_rented 0.005909
11 Car_Ownership_yes 0.013408
Selected Features: ['Income', 'Age', 'Profession', 'CITY', 'STATE']
# Lasso Feature Selection
from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import LogisticRegression
def lasso_feature_selection(X_train_norm, y_train, C=1.0):
    """L1-penalised logistic regression ("lasso"); keep features whose
    coefficients survive SelectFromModel's default threshold."""
    model = LogisticRegression(penalty='l1', C=C, solver='liblinear')
    model.fit(X_train_norm, y_train)
    coefficients = model.coef_[0]
    keep_mask = SelectFromModel(model, prefit=True).get_support()
    selected_features = X_train_norm.columns[keep_mask].tolist()
    return coefficients, selected_features
# Run the L1 filter with the default regularisation strength C=1.0.
coefficients, selected_features = lasso_feature_selection(X_train_norm, y_train)
print("\033[1mLasso Feature Selection\033[0m:")
print(pd.DataFrame({"Feature": X_train_norm.columns, "coefficients": coefficients}))
print("\nSelected Features:", selected_features)
Lasso Feature Selection:
Feature coefficients
0 Income -0.026976
1 Age -0.181849
2 Experience -0.433771
3 Profession -0.065401
4 CITY 0.046263
5 STATE -0.026324
6 CURRENT_JOB_YRS 0.135491
7 CURRENT_HOUSE_YRS -0.032156
8 Marital_status_single 0.236208
9 House_Ownership_owned -0.125784
10 House_Ownership_rented 0.239648
11 Car_Ownership_yes -0.158995
Selected Features: ['Income', 'Age', 'Experience', 'Profession', 'CITY', 'STATE', 'CURRENT_JOB_YRS', 'CURRENT_HOUSE_YRS', 'Marital_status_single', 'House_Ownership_owned', 'House_Ownership_rented', 'Car_Ownership_yes']
# Recursive Feature Elimination (RFE) Feature Selection
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression
def rfe_feature_selection(X_train_norm, y_train, n_features_to_select=None):
    """Recursive feature elimination with a logistic-regression estimator.

    Returns each feature's elimination rank (1 = kept) and the kept names.
    """
    rfe = RFE(estimator=LogisticRegression(),
              n_features_to_select=n_features_to_select)
    rfe.fit(X_train_norm, y_train)
    feature_ranking = rfe.ranking_
    selected_features = X_train_norm.columns[rfe.support_].tolist()
    return feature_ranking, selected_features
# With n_features_to_select=12 (all columns), nothing is eliminated —
# every rank comes back 1.
feature_ranking, selected_features = rfe_feature_selection(X_train_norm, y_train, n_features_to_select=12)
# Print the RFE feature selection results
print("\033[1mRFE Feature Selection\033[0m:")
print(pd.DataFrame({"Feature": X_train_norm.columns, "Ranking": feature_ranking}))
print(selected_features)
RFE Feature Selection:
Feature Ranking
0 Income 1
1 Age 1
2 Experience 1
3 Profession 1
4 CITY 1
5 STATE 1
6 CURRENT_JOB_YRS 1
7 CURRENT_HOUSE_YRS 1
8 Marital_status_single 1
9 House_Ownership_owned 1
10 House_Ownership_rented 1
11 Car_Ownership_yes 1
['Income', 'Age', 'Experience', 'Profession', 'CITY', 'STATE', 'CURRENT_JOB_YRS', 'CURRENT_HOUSE_YRS', 'Marital_status_single', 'House_Ownership_owned', 'House_Ownership_rented', 'Car_Ownership_yes']
from sklearn.ensemble import ExtraTreesRegressor
# NOTE(review): Risk_Flag is a binary class label, so ExtraTreesClassifier
# would be the natural choice here; the regressor still yields an importance
# ranking, but consider switching.  The run is also unseeded, so importances
# vary between executions.
model=ExtraTreesRegressor()
model.fit(X_train_norm,y_train)
feat_imp =pd.Series(model.feature_importances_, index=X_train_norm.columns)
# Horizontal bar chart of the largest importances (only 12 features exist,
# so nlargest(30) simply shows them all).
feat_imp.nlargest(30).plot(kind='barh')
plt.rcParams['figure.figsize']=(10,7)
plt.show()
Across the feature-selection techniques applied above, a few features consistently ranked among the least useful, so we remove the following less informative variables:
# Drop the features the selection experiments above flagged as least useful.
X_train_norm.drop(columns=['House_Ownership_rented','Marital_status_single'],inplace=True)
X_train_norm.head()
| Income | Age | Experience | Profession | CITY | STATE | CURRENT_JOB_YRS | CURRENT_HOUSE_YRS | House_Ownership_owned | Car_Ownership_yes | |
|---|---|---|---|---|---|---|---|---|---|---|
| 200471 | 0.912051 | 0.844828 | 0.90 | 0.08 | 0.854430 | 0.714286 | 0.214286 | 0.25 | 0.0 | 0.0 |
| 92611 | 0.262896 | 0.310345 | 0.00 | 0.26 | 0.984177 | 0.785714 | 0.000000 | 0.25 | 0.0 | 0.0 |
| 86397 | 0.100127 | 0.310345 | 0.45 | 0.84 | 0.389241 | 0.821429 | 0.285714 | 0.75 | 0.0 | 0.0 |
| 110500 | 0.389116 | 0.000000 | 0.10 | 0.96 | 0.933544 | 0.035714 | 0.142857 | 0.50 | 0.0 | 0.0 |
| 185133 | 0.946972 | 0.344828 | 0.25 | 1.00 | 0.177215 | 0.392857 | 0.357143 | 0.75 | 0.0 | 1.0 |
# Sanity-check shapes.  X_test still carries all 12 columns at this point;
# the same scaling and column drop are applied to it further below.
print(X_train_norm.shape,X_test.shape)
print(y_train.shape,y_test.shape)
(201600, 10) (50400, 12) (201600,) (50400,)
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

# Baseline model: strongly regularised logistic regression (C=0.01, liblinear solver)
LR = LogisticRegression(C=0.01, solver='liblinear')
LR = LR.fit(X_train_norm, y_train)
LR
LogisticRegression(C=0.01, solver='liblinear')In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
LogisticRegression(C=0.01, solver='liblinear')
# Training-set report and confusion matrix; predict once and reuse the result
train_pred = LR.predict(X_train_norm)
print(classification_report(y_train, train_pred, zero_division=0))
confusion_matrix(y_train, train_pred)
precision recall f1-score support
0 0.88 1.00 0.93 176857
1 0.00 0.00 0.00 24743
accuracy 0.88 201600
macro avg 0.44 0.50 0.47 201600
weighted avg 0.77 0.88 0.82 201600
array([[176857, 0],
[ 24743, 0]], dtype=int64)
# Normalization: apply the scaler fitted on the training data to the test set,
# then drop the same two dummy columns that were removed from the training set
X_test_norm = pd.DataFrame(norm.transform(X_test), index=X_test.index, columns=X_test.columns)
X_test_norm = X_test_norm.drop(columns=['House_Ownership_rented', 'Marital_status_single'])
X_test_norm.head()
| Income | Age | Experience | Profession | CITY | STATE | CURRENT_JOB_YRS | CURRENT_HOUSE_YRS | House_Ownership_owned | Car_Ownership_yes | |
|---|---|---|---|---|---|---|---|---|---|---|
| 75255 | 0.425508 | 0.448276 | 0.90 | 0.78 | 0.373418 | 0.000000 | 0.500000 | 0.75 | 0.0 | 0.0 |
| 192435 | 0.852788 | 0.396552 | 0.10 | 0.26 | 0.664557 | 0.000000 | 0.142857 | 0.00 | 0.0 | 0.0 |
| 154839 | 0.784648 | 0.586207 | 0.45 | 0.86 | 0.724684 | 0.071429 | 0.642857 | 0.75 | 0.0 | 0.0 |
| 59774 | 0.848999 | 0.689655 | 1.00 | 0.24 | 0.360759 | 1.000000 | 0.571429 | 0.25 | 0.0 | 0.0 |
| 63788 | 0.152854 | 0.982759 | 0.65 | 1.00 | 0.231013 | 0.785714 | 0.642857 | 0.50 | 0.0 | 0.0 |
# Hard class predictions on the normalised test set — all zeros: the
# regularised LR never predicts the minority class (see report below)
yhat = LR.predict(X_test_norm)
yhat
array([0, 0, 0, ..., 0, 0, 0], dtype=int64)
# Per-class probabilities; column 0 = P(Risk_Flag=0), column 1 = P(Risk_Flag=1)
yhat_prob = LR.predict_proba(X_test_norm)
yhat_prob
array([[0.8874984 , 0.1125016 ],
[0.84256025, 0.15743975],
[0.87335595, 0.12664405],
...,
[0.8624693 , 0.1375307 ],
[0.90680209, 0.09319791],
[0.87194441, 0.12805559]])
yhat_prob[:,1]
array([0.1125016 , 0.15743975, 0.12664405, ..., 0.1375307 , 0.09319791,
0.12805559])
pd.crosstab(y_test,yhat)
| col_0 | 0 |
|---|---|
| Risk_Flag | |
| 0 | 44147 |
| 1 | 6253 |
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
def model_evaluation(model):
    """Evaluate *model* on the held-out test set.

    Prints the sklearn classification report, draws the confusion matrix as a
    small heatmap, then prints accuracy, precision, recall and classification
    error derived from the raw TP/TN/FP/FN counts (NaN where a ratio would
    divide by zero).

    Relies on the notebook-level globals ``X_test_norm`` and ``y_test``.
    """
    predictions = model.predict(X_test_norm)
    conf = confusion_matrix(y_test, predictions)
    # Re-label the 2x2 matrix so the heatmap axes read naturally
    cm_matrix = pd.DataFrame({'Actual Positive:1': [conf[1, 1], conf[1, 0]], 'Actual Negative:0': [conf[0, 1], conf[0, 0]]},
                             index=['Predict Positive:1', 'Predict Negative:0'])
    print(classification_report(y_test, predictions, zero_division=0))
    plt.figure(figsize=(2, 2))
    sns.heatmap(cm_matrix, annot=True, fmt='d', cmap='Blues')
    # sklearn convention: rows = actual class, columns = predicted class
    TN, FP = conf[0, 0], conf[0, 1]
    FN, TP = conf[1, 0], conf[1, 1]
    total = TP + TN + FP + FN
    print('accuracy: ', (TP + TN) / total if total != 0 else np.nan)
    print('precision: ', (TP / (TP + FP)) if (TP + FP) != 0 else np.nan)
    print('recall: ', (TP / (TP + FN)) if (TP + FN) != 0 else np.nan)
    print('classification_error', (FP + FN) / float(total) if total != 0 else np.nan)
    print(pd.DataFrame({'Metrics': [TP, TN, FP, FN]}, index=['TP', 'TN', 'FP', 'FN']))
    plt.show()
    print("\n", "\n")
model_evaluation(LR)
precision recall f1-score support
0 0.88 1.00 0.93 44147
1 0.00 0.00 0.00 6253
accuracy 0.88 50400
macro avg 0.44 0.50 0.47 50400
weighted avg 0.77 0.88 0.82 50400
accuracy: 0.8759325396825397
precision: nan
recall: 0.0
classification_error 0.12406746031746031
Metrics
TP 0
TN 44147
FP 0
FN 6253
from sklearn.metrics import roc_auc_score

# Sweep the decision threshold in 0.005 steps over [0.1, 0.2] and keep the
# value that maximises the ROC-AUC of the resulting hard predictions.
step_factor = 0.005
threshold_value = 0.1
roc_score = 0
thrsh_score = threshold_value  # best threshold so far; initialised so it is always bound
predicted_proba = LR.predict_proba(X_test_norm)  # class probabilities for every test row
while threshold_value <= 0.2:  # check thresholds up to probability 0.2
    temp_thresh = threshold_value
    predicted = (predicted_proba[:, 1] >= temp_thresh).astype('int')  # move the class boundary
    # Compute the score once per threshold (the original recomputed it three times)
    current_score = roc_auc_score(y_test, predicted)
    print('Threshold', temp_thresh, '--', current_score)
    if roc_score < current_score:  # remember the best-scoring threshold
        roc_score = current_score
        thrsh_score = threshold_value
    threshold_value = threshold_value + step_factor
print('---Optimum Threshold ---', thrsh_score, '--ROC--', roc_score)
Threshold 0.1 -- 0.5143158212275201 Threshold 0.10500000000000001 -- 0.5210834355719189 Threshold 0.11000000000000001 -- 0.5271140434963745 Threshold 0.11500000000000002 -- 0.5327100363787237 Threshold 0.12000000000000002 -- 0.5347203700345564 Threshold 0.12500000000000003 -- 0.5314378248779227 Threshold 0.13000000000000003 -- 0.5228348045779668 Threshold 0.13500000000000004 -- 0.5217335197079443 Threshold 0.14000000000000004 -- 0.518516353729479 Threshold 0.14500000000000005 -- 0.5198057902963368 Threshold 0.15000000000000005 -- 0.5146064774630876 Threshold 0.15500000000000005 -- 0.5090353531566542 Threshold 0.16000000000000006 -- 0.5030960000458756 Threshold 0.16500000000000006 -- 0.5026186284412735 Threshold 0.17000000000000007 -- 0.500804383415973 Threshold 0.17500000000000007 -- 0.4999320452125852 Threshold 0.18000000000000008 -- 0.5 Threshold 0.18500000000000008 -- 0.5 Threshold 0.19000000000000009 -- 0.5 Threshold 0.1950000000000001 -- 0.5 ---Optimum Threshold --- 0.12000000000000002 --ROC-- 0.5347203700345564
from sklearn.metrics import precision_recall_curve
# Precision/recall pairs for every candidate probability threshold of the baseline LR
precision, recall, threshold = precision_recall_curve(y_test,yhat_prob[:,1])
import plotly.offline as py
import plotly.express as px
# Interactive PR curve; hovering shows the threshold that produces each point
# (threshold has one fewer entry than precision/recall, hence the [:-1] slices)
fig = px.line(x=recall[:-1], y = precision[:-1], hover_name=threshold)
fig.update_xaxes(title="Recall")
fig.update_yaxes(title="Precision")
fig
from sklearn.metrics import auc
# Area under the precision-recall curve (x = recall, y = precision)
auc_1= auc(recall, precision)
auc_1
0.1428763260635985
# Apply the optimum threshold (~0.12) found by the sweep above: predict the
# risky class whenever P(Risk_Flag=1) >= 0.12
y_pred_new_threshold = (LR.predict_proba(X_test_norm)[:, 1] >= 0.12).astype(int)
print(y_pred_new_threshold)
print(classification_report(y_test, y_pred_new_threshold,zero_division=0))
# Actual vs predicted counts at the new threshold
pd.crosstab(y_test,y_pred_new_threshold)
[0 1 1 ... 1 0 1]
precision recall f1-score support
0 0.89 0.43 0.59 44147
1 0.14 0.63 0.23 6253
accuracy 0.46 50400
macro avg 0.52 0.53 0.41 50400
weighted avg 0.80 0.46 0.54 50400
| col_0 | 0 | 1 |
|---|---|---|
| Risk_Flag | ||
| 0 | 19198 | 24949 |
| 1 | 2285 | 3968 |
from sklearn.ensemble import RandomForestClassifier

# Tree ensemble trained on the (still imbalanced) training data
rf = RandomForestClassifier(n_estimators=100)
rf = rf.fit(X_train_norm, y_train)
print(rf)
confusion_matrix(y_train, rf.predict(X_train_norm))
RandomForestClassifier()
array([[168878, 7979],
[ 4845, 19898]], dtype=int64)
# Hard class predictions from the random forest on the test set
y_pred = rf.predict(X_test_norm)
y_pred
array([0, 0, 0, ..., 1, 0, 0], dtype=int64)
model_evaluation(rf)
precision recall f1-score support
0 0.94 0.95 0.94 44147
1 0.60 0.53 0.57 6253
accuracy 0.90 50400
macro avg 0.77 0.74 0.76 50400
weighted avg 0.89 0.90 0.90 50400
accuracy: 0.898968253968254
precision: 0.6049918610960391
recall: 0.5349432272509196
classification_error 0.10103174603174603
Metrics
TP 3345
TN 41963
FP 2184
FN 2908
from imblearn.over_sampling import SMOTE
from collections import Counter

# Oversample the minority class (Risk_Flag=1) until the classes are balanced
smote = SMOTE(sampling_strategy='minority')
X_train_smote, y_train_smote = smote.fit_resample(X_train_norm, y_train)
print("Before SMOTE :", Counter(y_train))
print("After SMOTE :", Counter(y_train_smote))
Before SMOTE : Counter({0: 176857, 1: 24743})
After SMOTE : Counter({0: 176857, 1: 176857})
# Same logistic-regression configuration, now trained on the SMOTE-balanced data
LRSMOTE = LogisticRegression(C=0.01, solver='liblinear')
LRSMOTE = LRSMOTE.fit(X_train_smote, y_train_smote)
print(LRSMOTE)
confusion_matrix(y_train_smote, LRSMOTE.predict(X_train_smote))
LogisticRegression(C=0.01, solver='liblinear')
array([[ 88089, 88768],
[ 76270, 100587]], dtype=int64)
# Test-set predictions from the SMOTE-trained LR
yhat = LRSMOTE.predict(X_test_norm)
print(yhat)
# Positive-class probabilities for the same rows
yhat_prob = LRSMOTE.predict_proba(X_test_norm)
print(yhat_prob[:,1])
[0 1 1 ... 1 0 1] [0.46896669 0.56117115 0.52007521 ... 0.52896379 0.41403099 0.51214173]
# Actual vs predicted counts, then the full evaluation of the SMOTE-trained LR
pd.crosstab(y_test,yhat)
model_evaluation(LRSMOTE)
precision recall f1-score support
0 0.89 0.50 0.64 44147
1 0.14 0.57 0.22 6253
accuracy 0.50 50400
macro avg 0.51 0.53 0.43 50400
weighted avg 0.80 0.50 0.58 50400
accuracy: 0.5042460317460318
precision: 0.13744919682601123
recall: 0.5678874140412602
classification_error 0.4957539682539683
Metrics
TP 3551
TN 21863
FP 22284
FN 2702
# Random forest trained on the SMOTE-balanced training data
rfsmote = RandomForestClassifier(n_estimators=100).fit(X_train_smote, y_train_smote)
print(rfsmote)
# BUG FIX: the original called LRSMOTE.predict here (copy-paste), so it
# reprinted the logistic model's training confusion matrix instead of the
# forest's — use the forest's own predictions
confusion_matrix(y_train_smote, rfsmote.predict(X_train_smote))
RandomForestClassifier()
array([[ 88089, 88768],
[ 76270, 100587]], dtype=int64)
# Test-set predictions from the SMOTE-trained forest; y_pred_prob is reused
# further below for the precision-recall curve
y_pred = rfsmote.predict(X_test_norm)
print(y_pred)
y_pred_prob = rfsmote.predict_proba(X_test_norm)
print(y_pred_prob)
model_evaluation(rfsmote)
[0 0 0 ... 1 0 0]
[[1. 0. ]
[1. 0. ]
[1. 0. ]
...
[0.09339455 0.90660545]
[1. 0. ]
[0.59610456 0.40389544]]
precision recall f1-score support
0 0.97 0.91 0.94 44147
1 0.55 0.77 0.64 6253
accuracy 0.89 50400
macro avg 0.76 0.84 0.79 50400
weighted avg 0.91 0.89 0.90 50400
accuracy: 0.8923015873015873
precision: 0.5467846206192583
recall: 0.7709899248360786
classification_error 0.1076984126984127
Metrics
TP 4821
TN 40151
FP 3996
FN 1432
from sklearn.metrics import precision_recall_curve
# Precision/recall trade-off for the SMOTE random forest, from its predicted probabilities
precision, recall, threshold = precision_recall_curve(y_test,y_pred_prob[:,1])
import plotly.offline as py
import plotly.express as px
# Interactive PR curve; hovering shows the threshold producing each point
fig = px.line(x=recall[:-1], y = precision[:-1], hover_name=threshold)
fig.update_xaxes(title="Recall")
fig.update_yaxes(title="Precision")
fig
from sklearn.metrics import auc
# Area under the PR curve — ~0.61 here vs ~0.14 for the baseline LR above
auc_1= auc(recall, precision)
auc_1
0.6090355427576696
The SMOTE-trained random forest gives accuracy = 0.89, precision = 0.55, recall = 0.77 and classification error = 0.11 — the lowest error compared with the SMOTE logistic model, so it is the preferred model.
#import lazypredict
#from lazypredict.Supervised import LazyClassifier
# Define high memory classifiers to avoid memory issues
#highmem_classifiers = ["LabelSpreading","LabelPropagation","BernoulliNB",'SVC',"NearestCentroid","NuSVC","KNeighborsClassifier", "ElasticNetClassifier", "GradientBoostingClassifier", "HistGradientBoostingClassifier"]
#classifiers = [c for c in lazypredict.Supervised.CLASSIFIERS if c[0] not in highmem_classifiers]
# Run LazyPredict to find the best performing classifier
#clf = LazyClassifier(classifiers=classifiers,verbose=0,ignore_warnings=True)
#models,predictions = clf.fit(X_train_norm,X_test_norm,y_train,y_test)
#models # Print the list of models tested by LazyPredict